1 Load R packages and read in data

Code
#load necessary packages
library(vcfR)
library(SNPfiltR)

#read in unlinked SNP dataset
v <- read.vcfR("~/Desktop/nmel.ceyx.rad/filtered.unlinked.snps.vcf.gz")

2 subsample vcf for snapper

Code
#print vcf data
v
***** Object of Class vcfR *****
107 samples
2457 CHROMs
2,580 variants
Object size: 22.4 Mb
3.187 percent missing data
*****        *****         *****
Code
#read in sampling details
sample.info<-read.csv("~/Desktop/nmel.ceyx.rad/ceyx.full.sampling.csv")
#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
samps<-sample.info[sample.info$ID %in% colnames(v@gt)[-1],]
#reorder the sample info file to match the order of samples in the vcf
samps<-samps[match(colnames(v@gt)[-1],samps$ID),]
#make sure it worked (should be all true)
samps$ID == colnames(v@gt)[-1]
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE
Code
samps$admix.assign<-as.factor(samps$admix.assign)

#use a for loop to randomly downsample 2 samples from each of the 11 lineages 3 separate times
for (i in 1:3){
  
  #generate a set of 10 randomly selected samples (one from each species) using the sampling df called 'samps'
  x<-c() #make an empty vector called 'x' to hold this list
  for (j in 1:length(levels(samps$admix.assign))){
    #if there's less than one sample for a species, just add that sample
    if(length(samps$ID[samps$admix.assign == levels(samps$admix.assign)[j]]) == 1){
      x<-c(x,samps$ID[samps$admix.assign == levels(samps$admix.assign)[j]])
    }
    #if there's more than one sample for a species, randomly sample two and add them to the vector
    else{
        x<-c(x,sample(samps$ID[samps$admix.assign == levels(samps$admix.assign)[j]], size=2))
    }
  }
  #add "FORMAT" to 'x' so that the vcf info column is retained
  x<-c(x, "FORMAT")
  
  #subset the vcf to only the random samples plus the vcfR info (column 1)
  vcf.sub <- v[,colnames(v@gt) %in% x]
  #filter out invariant sites
  vcf.comp<-min_mac(vcf.sub, min.mac = 1)

  #write filtered subset vcf to disk
  print(colnames(vcf.comp@gt)) #print sample names in retained vcf
  print(samps$admix.assign[samps$ID %in% colnames(vcf.comp@gt)]) #print the assigned taxa for each sample to make sure we have subsampled correctly
  #commented to prevent overwriting the exact vcfs used in this analysis
  #vcfR::write.vcf(vcf.comp, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/rep",i,".vcf.gz")) #write to disk
  
  #write out a popmap that corresponds to this particular replicate:
  #define dataframe
  zz<-cbind.data.frame(samps$admix.assign[samps$ID %in% colnames(vcf.comp@gt)],colnames(vcf.comp@gt)[-1])
  colnames(zz)<-c("species","individual") #fix colnames
  #write to disk
  #commented to prevent overwriting
  #write.table(zz, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/popmap.rep",i,".txt"), sep = "\t", quote = F, row.names = F)
  }
35.62% of SNPs fell below a minor allele count of 1 and were removed from the VCF

 [1] "FORMAT"               "C_solitarius_7604"    "C_solitarius_9572"   
 [4] "C_nigromaxilla_15907" "C_margarethae_27254"  "C_margarethae_27261" 
 [7] "C_sacerdotis_27646"   "C_mulcatus_27674"     "C_mulcatus_27686"    
[10] "C_sacerdotis_29529"   "C_malaitae_32770"     "C_collectoris_33757" 
[13] "C_collectoris_33789"  "C_meeki_34848"        "C_meeki_34855"       
[16] "C_gentianus_34953"    "C_gentianus_13530"    "C_nigromaxilla_15892"
[19] "C_collectoris_33922"  "C_collectoris_36115"  "C_dispar_5611-2"     
[22] "C_malaitae_32790-2"  
 [1] solitarius   solitarius   nigromaxilla margarethae  margarethae 
 [6] sacerdotis   mulcatus     mulcatus     sacerdotis   malaitae    
[11] coll.west    coll.west    meeki        meeki        gentianus   
[16] gentianus    nigromaxilla coll.east    coll.east    dispar      
[21] malaitae    
11 Levels: coll.east coll.west dispar gentianus malaitae margarethae ... solitarius
33.26% of SNPs fell below a minor allele count of 1 and were removed from the VCF

 [1] "FORMAT"               "C_sacerdotis_5317"    "C_solitarius_9572"   
 [4] "C_margarethae_27245"  "C_mulcatus_27674"     "C_mulcatus_27746"    
 [7] "C_sacerdotis_29529"   "C_meeki_32014"        "C_malaitae_32770"    
[10] "C_nigromaxilla_32846" "C_collectoris_33842"  "C_meeki_34862"       
[13] "C_gentianus_34953"    "C_gentianus_35042"    "C_collectoris_36076" 
[16] "C_collectoris_36129"  "C_solitarius_7295"    "C_margarethae_14484" 
[19] "C_nigromaxilla_15880" "C_collectoris_33863"  "C_dispar_5611-2"     
[22] "C_malaitae_32790-2"  
 [1] sacerdotis   solitarius   margarethae  mulcatus     mulcatus    
 [6] sacerdotis   meeki        malaitae     nigromaxilla coll.west   
[11] meeki        gentianus    gentianus    coll.east    coll.east   
[16] solitarius   margarethae  nigromaxilla coll.west    dispar      
[21] malaitae    
11 Levels: coll.east coll.west dispar gentianus malaitae margarethae ... solitarius
35.04% of SNPs fell below a minor allele count of 1 and were removed from the VCF

 [1] "FORMAT"               "C_solitarius_4846"    "C_sacerdotis_5317"   
 [4] "C_solitarius_7629"    "C_margarethae_27254"  "C_sacerdotis_27646"  
 [7] "C_mulcatus_27686"     "C_mulcatus_27839"     "C_meeki_32007"       
[10] "C_meeki_32024"        "C_malaitae_32770"     "C_nigromaxilla_32860"
[13] "C_collectoris_33759"  "C_collectoris_33789"  "C_gentianus_34953"   
[16] "C_gentianus_34969"    "C_collectoris_36096"  "C_nigromaxilla_15880"
[19] "C_margarethae_19259"  "C_collectoris_36115"  "C_dispar_5611-2"     
[22] "C_malaitae_32790-2"  
 [1] solitarius   sacerdotis   solitarius   margarethae  sacerdotis  
 [6] mulcatus     mulcatus     meeki        meeki        malaitae    
[11] nigromaxilla coll.west    coll.west    gentianus    gentianus   
[16] coll.east    nigromaxilla margarethae  coll.east    dispar      
[21] malaitae    
11 Levels: coll.east coll.west dispar gentianus malaitae margarethae ... solitarius
Code
#read in last vcf file read to disk, to check that it looks right
z <- read.vcfR(paste0("~/Desktop/nmel.ceyx.rad/snapp/rep",i,".vcf.gz"))
Scanning file to determine attributes.
File attributes:
  meta lines: 14
  header_line: 15
  variant count: 1684
  column count: 30

Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
  Character matrix gt rows: 1684
  Character matrix gt cols: 30
  skip: 0
  nrows: 1684
  row_num: 0

Processed variant 1000
Processed variant: 1684
All variants processed
Code
colnames(z@gt)
 [1] "FORMAT"               "C_sacerdotis_5307"    "C_solitarius_5447-8" 
 [4] "C_nigromaxilla_15907" "C_margarethae_27254"  "C_sacerdotis_27646"  
 [7] "C_mulcatus_27674"     "C_mulcatus_27852"     "C_meeki_32023"       
[10] "C_malaitae_32770"     "C_collectoris_33789"  "C_collectoris_33871" 
[13] "C_meeki_34870"        "C_gentianus_34953"    "C_gentianus_34969"   
[16] "C_collectoris_36072"  "C_collectoris_36212"  "C_solitarius_6982"   
[19] "C_nigromaxilla_15892" "C_margarethae_19259"  "C_dispar_5611-2"     
[22] "C_malaitae_32790-2"  
Code
z
***** Object of Class vcfR *****
21 samples
1630 CHROMs
1,684 variants
Object size: 3.7 Mb
4.044 percent missing data
*****        *****         *****
Code
#read in last popmap read to disk, to check that it looks right
read.table(paste0("~/Desktop/nmel.ceyx.rad/snapp/popmap.rep",i,".txt"))
             V1                   V2
1       species           individual
2    sacerdotis    C_sacerdotis_5307
3    solitarius  C_solitarius_5447-8
4  nigromaxilla C_nigromaxilla_15907
5   margarethae  C_margarethae_27254
6    sacerdotis   C_sacerdotis_27646
7      mulcatus     C_mulcatus_27674
8      mulcatus     C_mulcatus_27852
9         meeki        C_meeki_32023
10     malaitae     C_malaitae_32770
11    coll.west  C_collectoris_33789
12    coll.west  C_collectoris_33871
13        meeki        C_meeki_34870
14    gentianus    C_gentianus_34953
15    gentianus    C_gentianus_34969
16    coll.east  C_collectoris_36072
17    coll.east  C_collectoris_36212
18   solitarius    C_solitarius_6982
19 nigromaxilla C_nigromaxilla_15892
20  margarethae  C_margarethae_19259
21       dispar      C_dispar_5611-2
22     malaitae   C_malaitae_32790-2

3 visualize a lognormal distribution centered on 3.9, ranging from ~3-5, the age estimate of this clade from McCullough et al. 2021

Code
#paper that this distribution is based on can be found here: https://royalsocietypublishing.org/doi/full/10.1098/rspb.2019.0122

#use the BEAUTI2 GUI to visualize a distribution where 95% of the probability density curve approximates the 95% HPD estimated from the above paper:
knitr::include_graphics("/Users/devonderaad/Desktop/beauti.lognormal.95.png")

Code
#ideal parameters are mean = 3.9, stdev = 0.15

4 make the constraint and starting tree files, which will be used for all five reps

Code
#make date constraint file called 'date.con.txt'
#make line 1: distribution of the node constraint, the word "crown", and the list of species that are in this crown clade
date.con<-c("lognormal(0,3.9,.15)","crown","solitarius,nigromaxilla,sacerdotis,mulcatus,meeki,malaitae,coll.west,coll.east,gentianus,dispar")
#make line 2: the word "monophyletic", "NA", and the list of species to enforce as monophyletic
date.con<-rbind.data.frame(date.con,c("monophyletic","NA","solitarius,nigromaxilla,sacerdotis,mulcatus,meeki,malaitae,coll.west,coll.east,gentianus,dispar"))
#write to disk
#write.table(date.con, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/date.con.txt"), sep = "\t", quote = F, row.names = F, col.names = F)

#now make a starting tree file called tre.start.txt based on the concatenated ML topology
#write tree
t<-"(margarethae,((mulcatus,solitarius),(sacerdotis,(dispar,((malaitae,meeki),(gentianus,(nigromaxilla,(coll.east,coll.west))))))))"
#write.table(t, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/tre.start.txt"), sep = "\t", quote = F, row.names = F, col.names = F)

Notes on running SNAPP or Snapper with a date constrained node, starting tree, and constraint tree. What I’ve put together here is based on the following two resources:

  • tutorial on divergence time estimation using the ruby scrpt snapp_prep: https://github.com/mmatschiner/tutorials/blob/master/divergence_time_estimation_with_snp_data/README.md

  • github page for that script: https://github.com/mmatschiner/snapp_prep

My workflow is as follows, and below is the most “constrained” version I’ve run. The simplest version is just with a date constraint.

5 code to generate input xml files

Code
#when I run this ruby script it references four files
#-v (input data; thinned VCF)
#-c (date constraint file; this includes the topology constraint)
#-s (starting tree file)
#-t (popmap)
#plus, three flags
#-x (name of the output xml file including a .xml extension)
#-o (the name prefix to append to log and tree files)
#-l (MCMC length)

#the date file is called 'date.con.txt' and it looks like this:
#lognormal(0,0.56,0.10)  crown   kul,par,tet,spl
#monophyletic    NA eic,gri,lon,mur

#this basically says, put a date constraint on the node that comprises those four tips (kul,par,tet,spl)
#and your constraint is centered on 0.56 MA, with a stdev of 0.10, and zero offset, and your distribution is lognormal
#the second line is the tree constraint which says keep that group of four tips monophyletic
#to enforce a root, you could just force all your ingroup tips to be monophyletic

#the popmap file is a standard tab-separated popmap, e.g.,:
#species individual
#eic     Z_gr329017.sorted
#pal     Z_gr330116.sorted
#eic     Z_gr335071.sorted

#finally, specify a starting tree (not a constraint tree) which must match the topology specified in the date constraint file above, i.e., you do have to specify a start tree in order to have overlapping starting space with your other topological constraints. Example:
#((vel,(lut,((spl,(tet,par)),kul))),(pal,(mur,(eic,(gri,lon)))))

#once you have all these files in one directory you simply run the below text to make five replicate input xml files
#download ruby script
wget https://raw.githubusercontent.com/mmatschiner/snapp_prep/master/snapp_prep.rb

#view help menu
ruby snapp_prep.rb -h

#unzip each output vcf
gunzip rep1.vcf.gz
gunzip rep2.vcf.gz
gunzip rep3.vcf.gz

#prepare each input .xml
ruby snapp_prep.rb -a SNAPP -v rep1.vcf -t popmap.rep1.txt -c date.con.txt -s tre.start.txt -x rep1.xml -o rep1 -l 5000000
ruby snapp_prep.rb -a SNAPP -v rep2.vcf -t popmap.rep2.txt -c date.con.txt -s tre.start.txt -x rep2.xml -o rep2 -l 5000000
ruby snapp_prep.rb -a SNAPP -v rep3.vcf -t popmap.rep3.txt -c date.con.txt -s tre.start.txt -x rep3.xml -o rep3 -l 5000000

#and if you wanted to produce an xml that will run Snapper instead of SNAPP you just add the "-a SNAPPER" flag included, see the snapp_prep.rb help menu

6 Then start 3 replicate SNAPP runs as an array on the cluster using the following code:

Code
#!/bin/sh
#
#SBATCH --job-name=snapp               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=25               # 40 CPU allocation per Task
#SBATCH --partition=bi            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/nmel.ceyx/snapp    # Set working d$
#SBATCH --mem-per-cpu=800            # memory requested
#SBATCH --array=1-3
#SBATCH --time=10000

#run beast 2.7.1
/home/d669d153/work/beast.2.7.1/beast/bin/beast -threads 25 rep$SLURM_ARRAY_TASK_ID.xml

7 Check out the three log files using Tracer

Code
#Determine which of the replicates (here, rep 3) had the greatest posterior probability and should be presented in the paper
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/trace.posteriors.png")

7.1 Make sure all rep 3 parameter estimates converged

Code
#Determine which of the replicates (here, rep 3) had the greatest posterior probability and should be presented in the paper
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep3.trace.png")

7.2 Make sure all three individual runs converged on similar topologies

Code
#rep1
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep1.png")

Code
#rep2
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep2.png")

Code
#rep3
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep3.png")

7.3 Make a consensus tree from all samples from the posterior distribution for rep 3 and visualize it in figtree with posterior probabilities labeled.

Code
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep3.contree.png")